# Remove every object from the current environment so that values left over
# from an earlier run cannot be picked up by the code below.
rm(list=ls())
# ggpubr supplies stat_compare_means(); the ggplot2 functions used further
# down are presumably attached through it -- TODO confirm.
library(ggpubr)
# TCGAbiolinks supplies getGDCprojects(), used to list the project ids.
library(TCGAbiolinks)
# reshape supplies melt(), used to build the long-format table.
library(reshape)
#data("ToothGrowth")
#p <- ggboxplot(ToothGrowth, x = "supp", y = "len",
#               color = "supp", palette = "jco",
#               add = "jitter",
#               facet.by = "dose", short.panel.labs = FALSE,yscale="log2")
# Use only p.format as label. Remove method name.
#p + stat_compare_means(label = "p.format")


# Project ids to analyse: every GDC project whose id contains "TCGA".
# getGDCprojects() is part of the package's public interface, so it is reached
# with `::` rather than the internal-access operator `:::`.
projects <- TCGAbiolinks::getGDCprojects()$project_id
projects <- projects[grepl("TCGA", projects)]

# Directory holding one "<project>Data.csv" and one "<project>Metadata.csv"
# file per project.
path <- "Z:/Bioinformatics/ExternalDatabases/TCGAbiolinksAnalysis/UnnormalizedData/"

# Accumulator for the per-sample expression table built by the project loop.
stor <- c()

#Gene = c("COL18A1","COL16A1","COL14A1","COL10A1","COL9A3","COL9A2","COL8A1","COL6A5","COL6A2","COL6A1","COL4A6","COL4A3","COL4A1","COL3A1","COL2A1","COL27A1","COL26A1","COL25A1","COl20A1","COL1A2","COL1A1")
#Gene="LAIR1"

# Gene symbols whose expression rows are extracted from every project.
Gene <- c(
  "COL1A1", "COL1A2", "COL2A1", "COL3A1", "COL4A1", "COL4A2", "COL4A3",
  "COL4A4", "COL4A5", "COL4A6", "COL5A1", "COL5A2", "COL5A3", "COL6A1",
  "COL6A2", "COL6A3", "COL6A5", "COL7A1", "COL8A1", "COL8A2", "COL9A1",
  "COL9A2", "COL9A3", "COL10A1", "COL11A1", "COL11A2", "COL12A1", "COL13A1",
  "COL14A1", "COL15A1", "COL16A1", "COL17A1", "COL18A1", "COL19A1", "COL20A1",
  "COL21A1", "COL22A1", "COL23A1", "COL24A1", "COL25A1", "EMID2", "COL27A1",
  "COL28A1", "COL29A1"
)

# Build `stor`: one row per sample holding the project id, the raw expression
# value of every gene from `Gene` found in the project, and a "Normal"/"Tumor"
# label. A project contributes rows only when its metadata lists more than two
# "Solid Tissue Normal" cases AND more than two "Primary solid Tumor" cases.

# Case barcodes are matched against the expression-matrix column names after
# every "-" has been replaced by ".".
barcode_to_colname <- function(barcodes) {
  gsub("-", ".", barcodes, fixed = TRUE)
}

# One character matrix per contributing project, each with named columns so
# that the gene labels travel with the values instead of being attached after
# the loop from whatever project happened to be read last.
project_tables <- list()

for (project in projects) {
  datPath <- file.path(path, paste0(project, "Data.csv"))
  metDatPath <- file.path(path, paste0(project, "Metadata.csv"))
  metadata <- read.csv(metDatPath)
  dat <- read.csv(datPath)

  norm <- barcode_to_colname(
    metadata$cases[which(metadata$tissue.definition == "Solid Tissue Normal")]
  )
  tumor <- barcode_to_colname(
    metadata$cases[which(metadata$tissue.definition == "Primary solid Tumor")]
  )

  # drop = FALSE keeps a data frame even when only one column matches.
  normDat <- dat[, colnames(dat) %in% norm, drop = FALSE]
  tumorDat <- dat[, colnames(dat) %in% tumor, drop = FALSE]
  has_both_groups <- length(norm) > 2 && length(tumor) > 2 &&
    ncol(normDat) > 0 && ncol(tumorDat) > 0

  # The first "|"-separated field of column X is compared against `Gene`.
  genes <- sub("\\|.*$", "", as.character(dat$X))
  ind <- which(genes %in% Gene)

  if (length(ind) > 0 && has_both_groups) {
    norMat <- cbind(project, t(normDat[ind, , drop = FALSE]), "Normal")
    tumMat <- cbind(project, t(tumorDat[ind, , drop = FALSE]), "Tumor")
    projMat <- rbind(norMat, tumMat)
    dimnames(projMat) <- list(NULL, c("proj", genes[ind], "DiseaseState"))
    project_tables[[length(project_tables) + 1]] <- projMat
  }
}

if (length(project_tables) == 0) {
  stop(
    "No project had matching genes together with more than two normal and ",
    "more than two tumor cases.",
    call. = FALSE
  )
}

# Use the first contributing project as the reference column layout and bring
# every other project into that order by NAME before stacking. A project whose
# gene set differs is reported instead of being bound positionally under the
# wrong labels.
reference_cols <- colnames(project_tables[[1]])
project_tables <- lapply(project_tables, function(tab) {
  cols <- colnames(tab)
  if (identical(cols, reference_cols)) {
    return(tab)
  }
  if (anyDuplicated(cols) > 0 ||
      length(cols) != length(reference_cols) ||
      !setequal(cols, reference_cols)) {
    stop(
      "Project ", tab[1, "proj"], " does not provide the same gene columns ",
      "as project ", project_tables[[1]][1, "proj"], ".",
      call. = FALSE
    )
  }
  tab[, match(reference_cols, cols), drop = FALSE]
})

stor <- as.data.frame(do.call(rbind, project_tables))
colnames(stor) <- reference_cols
# Long-format copy of the per-sample table: one row per sample and gene, with
# missing values removed and the remaining values transformed to log2(x + 1).
numericDat <- stor

# Convert only the expression columns. Running the conversion over the label
# columns as well would merely raise "NAs introduced by coercion" warnings
# before those columns are restored.
is_label <- colnames(numericDat) %in% c("proj", "DiseaseState")
numericDat[!is_label] <- lapply(
  numericDat[!is_label],
  function(column) as.numeric(as.character(column))
)

# Every column that is not one of the requested genes identifies the sample.
# A logical mask is used because negative indexing with an empty index would
# select nothing at all.
id_cols <- colnames(numericDat)[!(colnames(numericDat) %in% Gene)]
numericDat.reshaped <- melt(numericDat, id.vars = id_cols)

numericDat.reshaped <- numericDat.reshaped[!is.na(numericDat.reshaped$value), ]
numericDat.reshaped$value <- log2(numericDat.reshaped$value + 1)


# Per-sample summary used by the plots: the mean of the raw values across all
# gene columns (missing values ignored), then log2(mean + 1).
gene_columns <- stor[, !(colnames(stor) %in% c("proj", "DiseaseState")), drop = FALSE]

# Go through a matrix so the conversion keeps one row per sample regardless of
# how many samples or genes there are.
gene_matrix <- matrix(
  as.numeric(as.matrix(gene_columns)),
  nrow = nrow(gene_columns)
)
sample_mean <- apply(gene_matrix, 1, mean, na.rm = TRUE)

# The value column is deliberately called `numericDat`: the plotting and
# summary code further down refers to it by that name.
numericDat <- data.frame(numericDat = log2(sample_mean + 1))
numericDat$proj <- stor$proj
numericDat$DiseaseState <- stor$DiseaseState

#p <- ggboxplot(numericDat, x = "DiseaseState", y = "numericDat",color = "DiseaseState", palette =c("#006400", "#8B0000"),add = "jitter",facet.by = "proj",scales="free", short.panel.labs = FALSE, xlab="Tissue Type",ylab="TPM")
# Use only p.format as label. Remove method name.
#p + stat_compare_means(show.legend=FALSE, label.x.npc = 0.5,label.y.npc = 0.93, color = "black", size = 4)

# Export the per-sample summary table.
write.csv(numericDat, "C:/Users/ShaikJ/Desktop/Figs/collagen.csv")

# Box plot of the per-sample values, Normal vs Tumor, one free-scaled panel per
# project, with the stat_compare_means() label drawn in each panel.
p <- ggplot(numericDat, aes(DiseaseState, numericDat, fill = DiseaseState)) +
  facet_wrap(. ~ proj, scales = "free") +
  stat_compare_means(
    show.legend = FALSE,
    label.x.npc = 0.2,
    label.y.npc = 0.93,
    color = "black",
    size = 3
  ) +
  geom_boxplot(lwd = 1) +
  labs(x = "Disease State", y = "log2(TPM+1)") +
  scale_fill_manual(breaks = c("Normal", "Tumor"), values = c("green", "red")) +
  theme(
    panel.background = element_rect(
      fill = "white", colour = "white", size = 0.5, linetype = "solid"
    ),
    text = element_text(size = 20, face = "bold")
  )

png("U:/NC410Manuscript/Results/CollagenExpressionVariousCancersAllCollagens.png", width = 14, height = 9, units = 'in', res = 300)
# print() is required: a bare `p` is only auto-printed at the interactive top
# level, so when this script is run through source() nothing would be drawn
# on the device.
print(p)
dev.off()

# Collapse the per-sample values to one mean per project and tissue state.
# The result `strj` has the columns `datall` (the mean), `proj` and
# `DiseaseState`, with a Normal row followed by a Tumor row for each project.
projects <- unique(numericDat$proj)

# Collect one small data frame per project and bind them once at the end
# rather than re-copying the growing result on every iteration.
project_means <- list()

for (ii in projects) {
  in_project <- numericDat[which(numericDat$proj == ii), ]
  normal_mean <- mean(
    in_project$numericDat[which(in_project$DiseaseState == "Normal")]
  )
  tumor_mean <- mean(
    in_project$numericDat[which(in_project$DiseaseState == "Tumor")]
  )

  datall <- data.frame(datall = c(normal_mean, tumor_mean))
  datall$proj <- in_project$proj[1]
  datall$DiseaseState <- c("Normal", "Tumor")
  project_means[[length(project_means) + 1]] <- datall
}

strj <- do.call(rbind, project_means)

# Shorten the project ids for the facet labels.
strj$proj <- gsub("TCGA-", "", strj$proj)



# Bar plot of the per-project values in `strj`, Normal vs Tumor, one facet
# column per project, with the y axis zoomed to 8-15.
# The plotted values are means (they are computed with mean()), so the axis is
# labelled accordingly. The output file name is kept as it was so anything that
# looks for that file continues to find it.
p <- ggplot(strj, aes(x = DiseaseState, y = datall, fill = DiseaseState)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(
    breaks = c("Normal", "Tumor"),
    values = rev(gray.colors(2))
  ) +
  theme(
    legend.position = "none",
    panel.background = element_rect(
      fill = "white", colour = "white", size = 0.5, linetype = "solid"
    ),
    text = element_text(size = 20, face = "bold"),
    axis.text.x = element_text(angle = 90),
    axis.text.y = element_text(color = "black")
  ) +
  facet_grid(cols = vars(proj), scales = "free") +
  xlab("Tissue") +
  ylab("Mean log2(TPM+1)") +
  coord_cartesian(ylim = c(8, 15))

png("U:/NC410Manuscript/Results/CollagenExpressionBarplotMedian.png", width = 20, height = 5, units = 'in', res = 300)
# print() is required: a bare `p` is only auto-printed at the interactive top
# level, so when this script is run through source() nothing would be drawn
# on the device.
print(p)
dev.off()
